1.1. Notebook for WeatherType Classification#

# Display every expression's value in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Raw input CSV and the SQLite file that will hold the normalized schema.
data_filename="weather_classification_data.csv"
normalized_database_filename="normalized_weather_table.db"
import sqlite3
from sqlite3 import Error
import os

# Function to create a database connection
def create_connection(db_file, delete_db=False):
    """Open a SQLite connection to *db_file* and enable foreign-key checks.

    When *delete_db* is True an existing database file is removed first,
    so the caller starts from an empty database. Errors are printed and
    None is returned instead of raising.
    """
    # Start from a clean slate when the caller asks for it.
    if delete_db and os.path.exists(db_file):
        os.remove(db_file)

    connection = None
    try:
        connection = sqlite3.connect(db_file)
        connection.execute("PRAGMA foreign_keys = 1")
    except Error as e:
        print(e)
    return connection

# Function to create a table
def create_table(conn, create_table_sql, drop_table_name=None):
    """Execute a CREATE TABLE statement, optionally dropping an old table first.

    Errors from either step are printed rather than raised, matching the
    notebook's best-effort style.
    """
    # Drop the old table first so the CREATE below starts clean.
    if drop_table_name:
        try:
            conn.cursor().execute(f"DROP TABLE IF EXISTS {drop_table_name}")
        except Error as e:
            print(e)

    try:
        conn.cursor().execute(create_table_sql)
    except Error as e:
        print(e)

# Function to execute a SQL statement
def execute_sql_statement(sql_statement, conn):
    """Run a read-only query on *conn* and return all rows as a list of tuples."""
    return conn.cursor().execute(sql_statement).fetchall()

# Function to insert data into the table
def insert_function(sql, conn, values):
    """Bulk-insert *values* with the parameterized *sql* statement and commit.

    Returns the cursor's lastrowid (note: after executemany this value is
    not meaningful per the sqlite3 docs, but it is what the original returned).
    """
    cursor = conn.cursor()
    cursor.executemany(sql, values)
    conn.commit()
    return cursor.lastrowid
# Create tables for normalized data
def step1_create_tables(normalized_database_filename):
    """Create the normalized schema: five lookup tables plus the Weather fact table.

    The target database file is deleted first (delete_db=True), so every call
    rebuilds the schema from scratch. Weather holds the numeric measurements
    and foreign keys into the five lookup tables.
    """
    conn_norm = create_connection(normalized_database_filename, True)

    # Create Location table
    create_location_table_sql = """CREATE TABLE Location (
        LocationID INTEGER PRIMARY KEY AUTOINCREMENT,
        Location TEXT NOT NULL
    );"""
    create_table(conn_norm, create_location_table_sql, "Location")

    # Create WeatherType table
    create_weather_type_table_sql = """CREATE TABLE WeatherType (
        WeatherTypeID INTEGER PRIMARY KEY AUTOINCREMENT,
        WeatherType TEXT NOT NULL
    );"""
    create_table(conn_norm, create_weather_type_table_sql, "WeatherType")

    # Create Season lookup table (the table is named SeasonType)
    create_season_table_sql = """CREATE TABLE SeasonType (
        SeasonID INTEGER PRIMARY KEY AUTOINCREMENT,
        Season TEXT NOT NULL
    );"""
    # BUG FIX: the drop name was "Season", which does not match the actual
    # table name "SeasonType", so the drop step could never remove it.
    create_table(conn_norm, create_season_table_sql, "SeasonType")

    # Create CloudCover table
    create_cloud_cover_table_sql = """CREATE TABLE CloudCover (
        CloudCoverID INTEGER PRIMARY KEY AUTOINCREMENT,
        CloudCoverDescription TEXT NOT NULL
    );"""
    create_table(conn_norm, create_cloud_cover_table_sql, "CloudCover")

    # Create UVIndex table
    create_uv_index_table_sql = """CREATE TABLE UVIndex (
        UVIndexID INTEGER PRIMARY KEY AUTOINCREMENT,
        UVIndex REAL NOT NULL
    );"""
    create_table(conn_norm, create_uv_index_table_sql, "UVIndex")

    # Create Weather table (main fact table)
    create_weather_table_sql = """CREATE TABLE Weather (
        WeatherID INTEGER PRIMARY KEY AUTOINCREMENT,
        Temperature REAL,
        Humidity REAL,
        WindSpeed REAL,
        Precipitation REAL,
        AtmosphericPressure REAL,
        Visibility REAL,
        LocationID INTEGER,
        WeatherTypeID INTEGER,
        SeasonID INTEGER,
        CloudCoverID INTEGER,
        UVIndexID INTEGER,
        FOREIGN KEY (LocationID) REFERENCES Location (LocationID),
        FOREIGN KEY (WeatherTypeID) REFERENCES WeatherType (WeatherTypeID),
        -- BUG FIX: this FK referenced "Season", a table that does not exist;
        -- with PRAGMA foreign_keys=1 every insert into Weather would fail.
        FOREIGN KEY (SeasonID) REFERENCES SeasonType (SeasonID),
        FOREIGN KEY (CloudCoverID) REFERENCES CloudCover (CloudCoverID),
        FOREIGN KEY (UVIndexID) REFERENCES UVIndex (UVIndexID)
    );"""
    create_table(conn_norm, create_weather_table_sql, "Weather")

    conn_norm.close()
# Function to insert data into the Location table
def insert_location_data(data_filename, normalized_database_filename):
    """Populate the Location lookup table from the raw CSV.

    Reads the CSV (the first non-blank line is treated as the header and
    skipped), collects the distinct Location values, and inserts them in
    sorted order so the assigned LocationIDs are deterministic.
    """
    conn = sqlite3.connect(normalized_database_filename)
    try:
        cursor = conn.cursor()

        locations = set()
        with open(data_filename, 'r') as file:
            header = None
            for line in file:
                line = line.strip()
                if not line:
                    continue
                if not header:
                    header = line  # first non-blank line is the header row
                    continue
                values = line.split(',')
                # BUG FIX (comment only): Location is the 10th column (index 9);
                # the old comment claimed index 8.
                location = values[9]
                locations.add((location,))

        cursor.executemany("INSERT INTO Location (Location) VALUES (?)", sorted(locations))
        conn.commit()
    finally:
        # Always release the connection, even if the CSV read fails.
        conn.close()
# Function to insert data into the WeatherType table
def insert_weather_type_data(data_filename, normalized_database_filename):
    """Fill the WeatherType lookup table with the distinct labels from the CSV."""
    conn = sqlite3.connect(normalized_database_filename)
    cursor = conn.cursor()

    distinct_types = set()
    with open(data_filename, 'r') as csv_file:
        saw_header = False
        for raw_line in csv_file:
            row = raw_line.strip()
            if not row:
                continue  # ignore blank lines entirely
            if not saw_header:
                saw_header = True  # first non-blank line is the header
                continue
            fields = row.split(',')
            distinct_types.add((fields[10],))  # Weather Type is the 11th column (index 10)

    # Sorted insertion keeps the generated IDs deterministic.
    cursor.executemany("INSERT INTO WeatherType (WeatherType) VALUES (?)", sorted(distinct_types))
    conn.commit()
    conn.close()
# Function to insert data into the Season table
def insert_season_data(data_filename, normalized_database_filename):
    """Load the distinct Season values from the CSV into the SeasonType table."""
    conn = sqlite3.connect(normalized_database_filename)
    cursor = conn.cursor()

    unique_seasons = set()
    with open(data_filename, 'r') as csv_file:
        header_seen = False
        for raw_line in csv_file:
            stripped = raw_line.strip()
            if not stripped:
                continue  # skip blank lines
            if not header_seen:
                header_seen = True  # first non-blank line is the header row
                continue
            fields = stripped.split(',')
            unique_seasons.add((fields[7],))  # Season is the 8th column (index 7)

    # Sorted insertion keeps SeasonIDs deterministic across runs.
    cursor.executemany("INSERT INTO SeasonType (Season) VALUES (?)", sorted(unique_seasons))
    conn.commit()
    conn.close()
# Function to insert data into the CloudCover table
def insert_cloud_cover_data(data_filename, normalized_database_filename):
    """Fill the CloudCover lookup table with the distinct cover descriptions."""
    conn = sqlite3.connect(normalized_database_filename)
    cursor = conn.cursor()

    cover_values = set()
    with open(data_filename, 'r') as csv_file:
        header_seen = False
        for raw_line in csv_file:
            stripped = raw_line.strip()
            if not stripped:
                continue  # blank lines carry no data
            if not header_seen:
                header_seen = True  # skip the header row
                continue
            fields = stripped.split(',')
            cover_values.add((fields[4],))  # Cloud Cover is the 5th column (index 4)

    cursor.executemany("INSERT INTO CloudCover (CloudCoverDescription) VALUES (?)", sorted(cover_values))
    conn.commit()
    conn.close()
# Function to insert data into the UVIndex table
def insert_uv_index_data(data_filename, normalized_database_filename):
    """Fill the UVIndex lookup table with the distinct UV index readings."""
    conn = sqlite3.connect(normalized_database_filename)
    cursor = conn.cursor()

    readings = set()
    with open(data_filename, 'r') as csv_file:
        header_seen = False
        for raw_line in csv_file:
            stripped = raw_line.strip()
            if not stripped:
                continue  # skip blank lines
            if not header_seen:
                header_seen = True  # first non-blank line is the header
                continue
            fields = stripped.split(',')
            readings.add((float(fields[6]),))  # UV Index is the 7th column (index 6)

    # Numeric sort makes UVIndexIDs deterministic.
    cursor.executemany("INSERT INTO UVIndex (UVIndex) VALUES (?)", sorted(readings))
    conn.commit()
    conn.close()
# Build the normalized schema, then populate every lookup table from the raw CSV.
# CONSISTENCY FIX: pass the shared `normalized_database_filename` constant to
# step1_create_tables instead of a duplicated string literal (same value, but
# a single source of truth for the database path).
step1_create_tables(normalized_database_filename)
insert_location_data(data_filename, normalized_database_filename)
insert_weather_type_data(data_filename, normalized_database_filename)
insert_season_data(data_filename, normalized_database_filename)
insert_cloud_cover_data(data_filename, normalized_database_filename)
insert_uv_index_data(data_filename, normalized_database_filename)
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[11], line 1
----> 1 insert_location_data(data_filename, normalized_database_filename)
      2 insert_weather_type_data(data_filename, normalized_database_filename)
      3 insert_season_data(data_filename, normalized_database_filename)

Cell In[5], line 7, in insert_location_data(data_filename, normalized_database_filename)
      4 cursor = conn.cursor()
      6 locations = set()
----> 7 with open(data_filename, 'r') as file:
      8     header = None
      9     for line in file:

File ~/pyenv_book/lib/python3.12/site-packages/IPython/core/interactiveshell.py:324, in _modified_open(file, *args, **kwargs)
    317 if file in {0, 1, 2}:
    318     raise ValueError(
    319         f"IPython won't let you open fd={file} by default "
    320         "as it is likely to crash IPython. If you know what you are doing, "
    321         "you can use builtins' open."
    322     )
--> 324 return io_open(file, *args, **kwargs)

FileNotFoundError: [Errno 2] No such file or directory: 'weather_classification_data.csv'
# Function to insert data into the Weather table (main data table)
def insert_weather_data(data_filename, normalized_database_filename):
    """Populate the Weather fact table from the raw CSV.

    Every categorical value is replaced by its ID from the corresponding
    lookup table, which must already be populated. Unknown values map to
    NULL foreign keys (same behavior as the original per-row lookups).
    """
    conn = sqlite3.connect(normalized_database_filename)
    try:
        cursor = conn.cursor()

        # PERFORMANCE FIX: load each lookup table once instead of issuing
        # five SELECTs per CSV row (~65k queries for a 13k-row file).
        location_ids = dict(cursor.execute("SELECT Location, LocationID FROM Location"))
        weather_type_ids = dict(cursor.execute("SELECT WeatherType, WeatherTypeID FROM WeatherType"))
        season_ids = dict(cursor.execute("SELECT Season, SeasonID FROM SeasonType"))
        cloud_cover_ids = dict(cursor.execute("SELECT CloudCoverDescription, CloudCoverID FROM CloudCover"))
        uv_index_ids = dict(cursor.execute("SELECT UVIndex, UVIndexID FROM UVIndex"))

        weather_data = []
        with open(data_filename, 'r') as file:
            header = None
            for line in file:
                line = line.strip()
                if not line:
                    continue
                if not header:
                    header = line  # first non-blank line is the header row
                    continue
                values = line.split(',')
                temperature = float(values[0])           # Temperature (index 0)
                humidity = float(values[1])              # Humidity (index 1)
                wind_speed = float(values[2])            # Wind Speed (index 2)
                precipitation = float(values[3])         # Precipitation (index 3)
                cloud_cover = values[4]                  # Cloud Cover (index 4)
                atmospheric_pressure = float(values[5])  # Atmospheric Pressure (index 5)
                uv_index = float(values[6])              # UV Index (index 6)
                season = values[7]                       # Season (index 7)
                # CONSISTENCY FIX: convert Visibility to float like the other
                # numeric fields (the original stored the raw string and relied
                # on SQLite's REAL column affinity to coerce it).
                visibility = float(values[8])            # Visibility (index 8)
                location = values[9]                     # Location (index 9)
                weather_type = values[10]                # Weather Type (index 10)

                # Append values for insertion into Weather table; missing
                # lookup values become None (NULL), as with the old SELECTs.
                weather_data.append((
                    temperature, humidity, wind_speed, precipitation,
                    atmospheric_pressure, visibility,
                    location_ids.get(location),
                    weather_type_ids.get(weather_type),
                    season_ids.get(season),
                    cloud_cover_ids.get(cloud_cover),
                    uv_index_ids.get(uv_index),
                ))

        cursor.executemany("""
        INSERT INTO Weather (Temperature, Humidity, WindSpeed, Precipitation, AtmosphericPressure,Visibility, LocationID, WeatherTypeID, SeasonID, CloudCoverID, UVIndexID)
        VALUES (?,?,?,?,?,?,?,?,?,?,?)""", weather_data)
        conn.commit()
    finally:
        # Always release the connection, even if parsing fails mid-file.
        conn.close()

# Function to get IDs for categorical values
def get_id_from_table(table_name, id_column, column_name, value, conn):
    """Look up the primary-key ID for *value* in a lookup table, or None.

    Table and column names are interpolated (trusted, code-defined names);
    only *value* is passed as a bound parameter.
    """
    row = conn.cursor().execute(
        f"SELECT {id_column} FROM {table_name} WHERE {column_name}=?", (value,)
    ).fetchone()
    return None if row is None else row[0]
# Populate the Weather fact table (the lookup tables must be filled first).
insert_weather_data(data_filename, normalized_database_filename)
import sqlite3
import pandas as pd

def fetch_data_from_db(normalized_database_filename):
    """Re-assemble the flat weather dataset from the normalized database.

    Joins the Weather fact table back to its five lookup tables so the
    returned DataFrame has the same columns as the original CSV.
    """
    connection = sqlite3.connect(normalized_database_filename)

    # Inner joins: every Weather row carries valid IDs for all five lookups.
    sql_statement="""SELECT
    w.Temperature,w.Humidity,w.WindSpeed,w.Precipitation,c.CloudCoverDescription,w.AtmosphericPressure,u.UVIndex,s.Season,w.Visibility,l.Location,t.WeatherType
    FROM
        Weather w
    JOIN
        CloudCover c ON w.CloudCoverID = c.CloudCoverID
    JOIN
        UVIndex u ON w.UVIndexID = u.UVIndexID
    JOIN
        SeasonType s ON w.SeasonID = s.SeasonID
    JOIN
        Location l ON w.LocationID = l.LocationID
    JOIN
        WeatherType t ON w.WeatherTypeID = t.WeatherTypeID;"""

    # Let pandas run the query and build the DataFrame directly.
    result_frame = pd.read_sql_query(sql_statement, connection)

    connection.close()
    return result_frame
# Rebuild the flat dataset from the normalized database and render it.
df = fetch_data_from_db(normalized_database_filename)
display(df)
Temperature Humidity WindSpeed Precipitation CloudCoverDescription AtmosphericPressure UVIndex Season Visibility Location WeatherType
0 14.0 73.0 9.5 82.0 partly cloudy 1010.82 2.0 Winter 3.5 inland Rainy
1 39.0 96.0 8.5 71.0 partly cloudy 1011.43 7.0 Spring 10.0 inland Cloudy
2 30.0 64.0 7.0 16.0 clear 1018.72 5.0 Spring 5.5 mountain Sunny
3 38.0 83.0 1.5 82.0 clear 1026.25 7.0 Spring 1.0 coastal Sunny
4 27.0 74.0 17.0 66.0 overcast 990.67 1.0 Winter 2.5 mountain Rainy
... ... ... ... ... ... ... ... ... ... ... ...
13195 10.0 74.0 14.5 71.0 overcast 1003.15 1.0 Summer 1.0 mountain Rainy
13196 -1.0 76.0 3.5 23.0 cloudy 1067.23 1.0 Winter 6.0 coastal Snowy
13197 30.0 77.0 5.5 28.0 overcast 1012.69 3.0 Autumn 9.0 coastal Cloudy
13198 3.0 76.0 10.0 94.0 overcast 984.27 0.0 Winter 2.0 inland Snowy
13199 -5.0 38.0 0.0 92.0 overcast 1015.37 5.0 Autumn 10.0 mountain Rainy

13200 rows × 11 columns

#Checking
# Round-trip sanity check: the DataFrame rebuilt from the database should
# contain exactly the same rows as the original CSV (column names may differ).
df2=pd.read_csv(data_filename)
df1=df
import pandas as pd

# Assuming df1 and df2 are the two DataFrames to compare

# Option 1: Reset indexes and ignore column names
# Sort both frames by all of their columns so that row order (lost by the
# SQL joins) cannot cause a false mismatch.
df1_sorted = df1.sort_values(by=list(df1.columns)).reset_index(drop=True)
df2_sorted = df2.sort_values(by=list(df2.columns)).reset_index(drop=True)

# Compare the DataFrames after sorting
are_equal = df1_sorted.values.tolist() == df2_sorted.values.tolist()

# Print the result
if are_equal:
    print("The DataFrames have the same data (ignoring column headers).")
else:
    print("The DataFrames have different data.")
The DataFrames have the same data (ignoring column headers).
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport

# Automated EDA report: per-column distributions, correlations, missing values.
profile = ProfileReport(df, title="Pandas Profiling Report")
profile

# There are no missing values: every column's non-null count equals the
# 13200-row total. df.info() also lists column names and dtypes; 4 columns
# are categorical (object dtype).
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Temperature            13200 non-null  float64
 1   Humidity               13200 non-null  float64
 2   WindSpeed              13200 non-null  float64
 3   Precipitation          13200 non-null  float64
 4   CloudCoverDescription  13200 non-null  object 
 5   AtmosphericPressure    13200 non-null  float64
 6   UVIndex                13200 non-null  float64
 7   Season                 13200 non-null  object 
 8   Visibility             13200 non-null  float64
 9   Location               13200 non-null  object 
 10  WeatherType            13200 non-null  object 
dtypes: float64(7), object(4)
memory usage: 1.1+ MB
# Frequency table for each categorical column (each expression is displayed
# thanks to ast_node_interactivity = "all" set at the top of the notebook).
categorical_columns =["CloudCoverDescription","Season","Location","WeatherType"]
for coln in categorical_columns:
    df[coln].value_counts()
CloudCoverDescription
overcast         6090
partly cloudy    4560
clear            2139
cloudy            411
Name: count, dtype: int64
Season
Winter    5610
Spring    2598
Autumn    2500
Summer    2492
Name: count, dtype: int64
Location
inland      4816
mountain    4813
coastal     3571
Name: count, dtype: int64
WeatherType
Rainy     3300
Cloudy    3300
Sunny     3300
Snowy     3300
Name: count, dtype: int64
# Summary of Numerical Attributes
# count / mean / std / min / quartiles / max for every numeric column.
df.describe()
Temperature Humidity WindSpeed Precipitation AtmosphericPressure UVIndex Visibility
count 13200.000000 13200.000000 13200.000000 13200.000000 13200.000000 13200.000000 13200.000000
mean 19.127576 68.710833 9.832197 53.644394 1005.827896 4.005758 5.462917
std 17.386327 20.194248 6.908704 31.946541 37.199589 3.856600 3.371499
min -25.000000 20.000000 0.000000 0.000000 800.120000 0.000000 0.000000
25% 4.000000 57.000000 5.000000 19.000000 994.800000 1.000000 3.000000
50% 21.000000 70.000000 9.000000 58.000000 1007.650000 3.000000 5.000000
75% 31.000000 84.000000 13.500000 82.000000 1016.772500 7.000000 7.500000
max 109.000000 109.000000 48.500000 109.000000 1199.210000 14.000000 20.000000
import matplotlib.pyplot as plt

# Histograms of every numeric column to eyeball skew, spread, and outliers.
df.hist(bins=50, figsize=(12, 8))
plt.show()
array([[<Axes: title={'center': 'Temperature'}>,
        <Axes: title={'center': 'Humidity'}>,
        <Axes: title={'center': 'WindSpeed'}>],
       [<Axes: title={'center': 'Precipitation'}>,
        <Axes: title={'center': 'AtmosphericPressure'}>,
        <Axes: title={'center': 'UVIndex'}>],
       [<Axes: title={'center': 'Visibility'}>, <Axes: >, <Axes: >]],
      dtype=object)
../../_images/62b23b52a39bf7cdf229d604ad22d0a05bfd7edb80a7e76ce9e35af7078c6d7b.png

When Not to Apply Log Transformation: If your data contains non-positive values, like zero or negative values, log transformation should be skipped or an alternative method should be used. For columns with only positive values and no significant skewness, other preprocessing methods like MinMaxScaler, StandardScaler, or OneHotEncoding might be more appropriate.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Assuming `data` is your DataFrame
def check_log_transformation(df):
    """Report which numeric columns would benefit from a log transform.

    A column qualifies when every value is strictly positive and log1p
    reduces its variance. Non-numeric (object-dtype) columns are ignored.
    Skipped columns are reported via print, matching the notebook output.
    Returns the list of qualifying column names.
    """
    candidates = []

    for column in df.columns:
        if df[column].dtype == 'object':
            continue  # non-numeric column: nothing to transform
        print(column)
        # A log transform needs strictly positive values.
        if not (df[column] > 0).all():
            print(f"{column}: Contains non-positive values, log transformation skipped.")
            continue
        # log1p is numerically safe near zero; keep the column only when it
        # actually shrinks the variance.
        shrunk = np.log1p(df[column])
        if np.var(df[column]) > np.var(shrunk):
            candidates.append(column)
        else:
            print(f"{column}: Log transformation not necessary.")

    return candidates

# Example usage
# Assuming you have a train-test split
# train_df = ... (training dataset)
# test_df = ... (testing dataset)

# Collect the numeric columns that would benefit from a log transform.
features_to_log_transform = check_log_transformation(df)
# Train models with log-transformed data
# Example: logistic regression model
Temperature
Temperature: Contains non-positive values, log transformation skipped.
Humidity
WindSpeed
WindSpeed: Contains non-positive values, log transformation skipped.
Precipitation
Precipitation: Contains non-positive values, log transformation skipped.
AtmosphericPressure
UVIndex
UVIndex: Contains non-positive values, log transformation skipped.
Visibility
Visibility: Contains non-positive values, log transformation skipped.

1.2. Split the data and possibly stratify!#

  • We know that Precipitation, Temperature, and UVIndex are the most important features so far. Let’s split the data into training and testing sets, ensuring that both sets have the same distribution of the target classes.

# Peek at the first rows before splitting the data.
df.head(5)
Temperature Humidity WindSpeed Precipitation CloudCoverDescription AtmosphericPressure UVIndex Season Visibility Location WeatherType
0 14.0 73.0 9.5 82.0 partly cloudy 1010.82 2.0 Winter 3.5 inland Rainy
1 39.0 96.0 8.5 71.0 partly cloudy 1011.43 7.0 Spring 10.0 inland Cloudy
2 30.0 64.0 7.0 16.0 clear 1018.72 5.0 Spring 5.5 mountain Sunny
3 38.0 83.0 1.5 82.0 clear 1026.25 7.0 Spring 1.0 coastal Sunny
4 27.0 74.0 17.0 66.0 overcast 990.67 1.0 Winter 2.5 mountain Rainy
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Check class distribution in the target variable 'WeatherType'
print("\nWeatherType distribution in the dataset:")
print(df["WeatherType"].value_counts())

# Plot the distribution of WeatherType
# The four classes are perfectly balanced (3300 rows each), so stratifying
# mainly guards against an unlucky random split.
df["WeatherType"].value_counts().sort_index().plot.bar(rot=0, grid=True)
plt.xlabel("Weather Type")
plt.ylabel("Number of Observations")
plt.title("Distribution of Weather Type")
plt.show()
WeatherType distribution in the dataset:
WeatherType
Rainy     3300
Cloudy    3300
Sunny     3300
Snowy     3300
Name: count, dtype: int64
<Axes: xlabel='WeatherType'>
Text(0.5, 0, 'Weather Type')
Text(0, 0.5, 'Number of Observations')
Text(0.5, 1.0, 'Distribution of Weather Type')
../../_images/73b4db4ddf09bcd89e42c9c3e30e9f6b3a479648f0cb7601d62ef38a3a803514.png
# Stratify based on WeatherType
# 80/20 split; stratify keeps each class at 25% in both sets and
# random_state=42 makes the split reproducible.
strat_train_set, strat_test_set = train_test_split(df, test_size=0.20, stratify=df["WeatherType"], random_state=42)

# Check class distribution in training and testing sets
print("\nWeatherType distribution in training set:")
print(strat_train_set["WeatherType"].value_counts(normalize=True))

print("\nWeatherType distribution in testing set:")
print(strat_test_set["WeatherType"].value_counts(normalize=True))

# Remove any temporary or stratification column if created (not needed here)
# If "income_cat" or similar column was created, drop it as done in your earlier code
# Example: strat_train_set.drop("temp_column", axis=1, inplace=True)

# Verify the train-test split
print("\nTraining set shape:", strat_train_set.shape)
print("Testing set shape:", strat_test_set.shape)

# Copy the training set for further operations
# Copies keep the original split frames untouched by later mutations.
weather_train = strat_train_set.copy()
weather_test = strat_test_set.copy()
print("\nTraining data copy created for further use.")
WeatherType distribution in training set:
WeatherType
Snowy     0.25
Sunny     0.25
Rainy     0.25
Cloudy    0.25
Name: proportion, dtype: float64

WeatherType distribution in testing set:
WeatherType
Rainy     0.25
Cloudy    0.25
Sunny     0.25
Snowy     0.25
Name: proportion, dtype: float64

Training set shape: (10560, 11)
Testing set shape: (2640, 11)

Training data copy created for further use.

1.2.1. Observation from Correlation Matrix#

From the correlation Matrix from y-profile, we could see the target variable WeatherType is highly correlated with Temperature, Precipitation.

Temperature, Precipitation, Visibility, CloudCoverDescription, and Humidity are the most relevant features for predicting WeatherType.

Season, WindSpeed, and Location have very low correlation with WeatherType and with the other features, indicating limited predictive utility.

# Install pandas-profiling if not already installed: pip install ydata-profiling
from ydata_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Generate a profile report for the training data
# Profiling only the training split avoids leaking test-set information
# into modeling decisions.
profile = ProfileReport(weather_train, title="Weather Data Profile", explorative=True)
profile.to_notebook_iframe()  # Displays within a notebook



# Observations and data cleanup tasks
# (Make notes here based on the profile and correlation findings.)
# Observations about features
print("\nTraining Data Description:")
print(weather_train.describe())

# Checking for missing values
# Only columns with at least one null are printed (empty Series = no nulls).
missing_values = weather_train.isnull().sum()
print("\nMissing Values in Training Data:")
print(missing_values[missing_values > 0])

# Plotting distributions of numeric features
numeric_features = ["AtmosphericPressure", "Humidity", "Temperature", "WindSpeed", "UVIndex", "Visibility", "Precipitation"]
weather_train[numeric_features].hist(bins=20, figsize=(12, 8))
plt.suptitle("Distribution of Numeric Features")
plt.show()

# Categorical value distributions
# One frequency table and one bar chart per categorical feature.
categorical_features = ["CloudCoverDescription", "Season", "Location"]
for col in categorical_features:
    print(f"\nValue Counts for {col}:")
    print(weather_train[col].value_counts())
    weather_train[col].value_counts().plot.bar()
    plt.title(f"Distribution of {col}")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.show()

# List of cleanup tasks
print("\nData Cleanup Tasks:")
cleanup_tasks = [
    "Impute missing values if any.",
    "Scale numeric features (e.g., StandardScaler).",
    "Encode categorical features using OneHotEncoder.",
    "Address potential outliers in numeric features (e.g., log transformation).",
    "Check for capped values in numeric features."
]
for task in cleanup_tasks:
    print("- " + task)
Training Data Description:
        Temperature      Humidity     WindSpeed  Precipitation  \
count  10560.000000  10560.000000  10560.000000   10560.000000   
mean      19.080682     68.559091      9.882765      53.605398   
std       17.331979     20.173429      6.898173      31.999995   
min      -25.000000     20.000000      0.000000       0.000000   
25%        4.000000     57.000000      5.000000      19.000000   
50%       21.000000     70.000000      9.000000      58.000000   
75%       31.000000     83.000000     13.500000      82.000000   
max      109.000000    109.000000     48.500000     109.000000   

       AtmosphericPressure       UVIndex    Visibility  
count         10560.000000  10560.000000  10560.000000  
mean           1005.656206      4.001042      5.467235  
std              37.313099      3.856884      3.385381  
min             800.230000      0.000000      0.000000  
25%             994.690000      1.000000      3.000000  
50%            1007.620000      3.000000      5.000000  
75%            1016.780000      7.000000      7.500000  
max            1198.850000     14.000000     20.000000  

Missing Values in Training Data:
Series([], dtype: int64)
array([[<Axes: title={'center': 'AtmosphericPressure'}>,
        <Axes: title={'center': 'Humidity'}>,
        <Axes: title={'center': 'Temperature'}>],
       [<Axes: title={'center': 'WindSpeed'}>,
        <Axes: title={'center': 'UVIndex'}>,
        <Axes: title={'center': 'Visibility'}>],
       [<Axes: title={'center': 'Precipitation'}>, <Axes: >, <Axes: >]],
      dtype=object)
Text(0.5, 0.98, 'Distribution of Numeric Features')
../../_images/d508f5effe0f196d9b3545f4a0ee77d01ba32bc61101e13ecd8e1a932792af93.png
Value Counts for CloudCoverDescription:
CloudCoverDescription
overcast         4876
partly cloudy    3654
clear            1704
cloudy            326
Name: count, dtype: int64
<Axes: xlabel='CloudCoverDescription'>
Text(0.5, 1.0, 'Distribution of CloudCoverDescription')
Text(0.5, 0, 'CloudCoverDescription')
Text(0, 0.5, 'Frequency')
../../_images/614e0dfb1e52a8bd345a4772826135731b32a1f21ca503e9d7cac92ec92a2271.png
Value Counts for Season:
Season
Winter    4506
Spring    2048
Autumn    2021
Summer    1985
Name: count, dtype: int64
<Axes: xlabel='Season'>
Text(0.5, 1.0, 'Distribution of Season')
Text(0.5, 0, 'Season')
Text(0, 0.5, 'Frequency')
../../_images/82140e3f7d57d652605979bf9c54bcc38bd5126ed310785ca7e5e85a60bd445c.png
Value Counts for Location:
Location
inland      3856
mountain    3829
coastal     2875
Name: count, dtype: int64
<Axes: xlabel='Location'>
Text(0.5, 1.0, 'Distribution of Location')
Text(0.5, 0, 'Location')
Text(0, 0.5, 'Frequency')
../../_images/022cfa999daf2ad860234c8b80d410266aeb5a08055fdacb4340c6b181cc484d.png
Data Cleanup Tasks:
- Impute missing values if any.
- Scale numeric features (e.g., StandardScaler).
- Encode categorical features using OneHotEncoder.
- Address potential outliers in numeric features (e.g., log transformation).
- Check for capped values in numeric features.

The outputs represent the performance of your Logistic Regression model at different stages of evaluation and optimization. Here’s a detailed explanation of each part:

Cross-Validation Results 3-Fold Cross-Validation:

Mean = 0.871: The average accuracy across the 3 folds of cross-validation is 87.1%. Std = 0.003: The standard deviation of the accuracy scores across the folds is 0.3%. This indicates the model performs consistently across the folds, as the variance is very low. 10-Fold Cross-Validation:

Mean = 0.872: The average accuracy across 10 folds is slightly higher at 87.2%. Std = 0.010: The standard deviation is slightly larger (1%) compared to 3-fold CV. This is expected because 10-fold CV divides the data into smaller subsets, potentially leading to more variation in results. What it tells you:

Your model is performing well and consistently during cross-validation. A higher number of folds (e.g., 10) provides a more reliable estimate of model performance. Best Parameters classifier__C: 1: This is the regularization strength parameter. A value of 1 means moderate regularization, which balances underfitting and overfitting. classifier__penalty: l1: The model uses L1 regularization (Lasso), which encourages sparsity by shrinking less important feature weights to zero. classifier__solver: saga: The saga solver is used, suitable for large datasets and supports L1 regularization. What it tells you:

The hyperparameter tuning identified the best combination of parameters that optimize the model’s accuracy during 3-fold cross-validation. Best Cross-Validation Accuracy 87.1%: This is the highest accuracy achieved during the 3-fold cross-validation after hyperparameter tuning. What it tells you:

Your optimized model performs well during cross-validation and is ready for further evaluation on unseen data. Training Accuracy with Best Model 87.3%: After training the best model on the entire training dataset, its accuracy is 87.3%. What it tells you:

The model performs similarly on the full training dataset compared to the cross-validation results (87.1–87.3%). This consistency is a good sign that the model isn’t overfitting or underfitting. Takeaways Consistency: The cross-validation and training accuracies are very close, showing the model generalizes well to new data. Model Quality: An accuracy of ~87% indicates the model is effective at predicting WeatherType. Next Steps: Evaluate the model on the test set (weather_test) to measure performance on unseen data. Analyze the feature importance or coefficients to understand which features contribute most to the predictions. Would you like guidance on test set evaluation or feature importance analysis?

# Split weather_train into features and target
X_train = weather_train.drop("WeatherType", axis=1)
y_train = weather_train["WeatherType"]

# Split weather_test into features and target
X_test = weather_test.drop("WeatherType", axis=1)
y_test = weather_test["WeatherType"]

# Define feature types
# These lists drive the ColumnTransformer below: numeric columns get scaled,
# categorical columns get one-hot encoded.
numerical_features = ['Temperature', 'Humidity', 'WindSpeed', 'Precipitation', 'AtmosphericPressure', 'UVIndex', 'Visibility']
categorical_features = ['CloudCoverDescription', 'Season', 'Location']

Key Changes: Confusion Matrix Logging:

Added confusion_matrix computation for TP (True Positives), TN (True Negatives), FP (False Positives), and FN (False Negatives). Logged these metrics in MLFlow. Cross-Validation Mean and Standard Deviation:

Extracted the mean (cv_mean) and standard deviation (cv_std) of F1-scores from GridSearchCV. Logged these metrics in MLFlow. Whole Training Data Results:

Computed the F1-score on the training data using the best model. Logged all metrics related to the training data. Outputs: Logged to MLFlow: Best hyperparameters. Cross-validation mean and standard deviation of F1-scores. Training F1-score and confusion matrix components (TP, TN, FP, FN). The trained logistic regression model. This ensures all the required metrics and results are available for analysis and comparison.

1.2.2. conclusion#

Metrics: Train F1-Score: 0.873 This indicates how well the model balances precision and recall. An F1-score close to 1 is ideal, meaning the model performs very well in identifying both positive and negative cases. Train True Positives (TP): 86 Correctly identified positive instances (e.g., instances where the event occurred and the model correctly predicted it). Train True Negatives (TN): 2196 Correctly identified negative instances (e.g., instances where the event did not occur and the model correctly predicted it). Train False Positives (FP): 271 Incorrectly identified instances as positive when they were actually negative. Train False Negatives (FN): 87

Incorrectly identified instances as negative when they were actually positive. CV Mean F1-Score: 0.87

High mean F1-score across cross-validation, suggesting the model performs well on different splits of the data. CV Std F1-Score: 0.0025

A small standard deviation indicates stability in performance across different folds, which is good. Evaluation: F1-Score: A value of around 0.87 indicates the model is doing a good job in balancing precision and recall.

A score near 1 would be excellent, but 0.87 is generally considered good for many machine learning problems. Confusion Matrix:

High true negatives and true positives suggest the model effectively predicts both classes. Low false negatives and false positives indicate the model is avoiding making incorrect predictions. Cross-validation Performance:

With a CV mean F1-score of 0.87 and a small standard deviation, the model shows consistent performance across different data splits. Conclusion: Your model appears to be performing well, especially with a high F1-score and stable cross-validation results. However, it could be further fine-tuned by adjusting hyperparameters or experimenting with other preprocessing techniques to improve performance. Let me know if you’d like to explore these options!

# Experiment 1: hyperparameter-tuned Logistic Regression, logged to MLFlow.
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix

# Set up MLFlow.
# SECURITY: these credentials are hard-coded and committed with the notebook,
# so the token is effectively public. Rotate the token and load it from the
# environment (outside the notebook) instead of embedding it here.
MLFLOW_TRACKING_URI = "https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME'] = 'Indumathitv27'
os.environ['MLFLOW_TRACKING_PASSWORD'] = 'a47dfca47f1f5d628277b6598611dcb071281919'
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("WeatherType_Classification_Final")

# Preprocessing: the numeric columns are routed through BOTH StandardScaler
# and MinMaxScaler, so every numeric feature appears twice in the transformed
# matrix. NOTE(review): this looks unintentional -- confirm whether a single
# scaling branch was meant.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('scaled_num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Logistic Regression pipeline: preprocessing followed by the classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

# Hyperparameter grid searched below.
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__solver': ['lbfgs', 'liblinear']
}

# 3-fold cross-validation with weighted F1 as the model-selection score.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1_weighted', return_train_score=True)

# Train, evaluate on the training split, and log everything to MLFlow.
with mlflow.start_run(run_name="WeatherType_LogisticRegression"):
    # Perform grid search
    grid_search.fit(X_train, y_train)

    # Best estimator and its cross-validation summary.
    best_model = grid_search.best_estimator_
    cv_results = grid_search.cv_results_
    cv_mean = grid_search.best_score_
    cv_std = cv_results['std_test_score'][grid_search.best_index_]

    # Predictions and metrics on the training data.
    y_pred_train = best_model.predict(X_train)
    train_f1 = f1_score(y_train, y_pred_train, average='weighted')

    # BUG FIX: WeatherType is multi-class, so confusion_matrix() returns a
    # KxK matrix; the previous `.ravel()[:4]` read the first four cells (the
    # first row of the matrix), which are not TN/FP/FN/TP. Aggregate the
    # per-class one-vs-rest counts instead (micro totals over all classes).
    cm = confusion_matrix(y_train, y_pred_train)
    diag = cm.diagonal()
    tp = int(diag.sum())                           # correct predictions, all classes
    fp = int((cm.sum(axis=0) - diag).sum())        # predicted class i, true class != i
    fn = int((cm.sum(axis=1) - diag).sum())        # true class i, predicted class != i
    tn = int((cm.sum() - (cm.sum(axis=0) + cm.sum(axis=1) - diag)).sum())

    # Log parameters and metrics
    mlflow.log_param("Best Hyperparameters", grid_search.best_params_)
    mlflow.log_metric("CV Mean F1-Score", cv_mean)
    mlflow.log_metric("CV Std F1-Score", cv_std)
    mlflow.log_metric("Train F1-Score", train_f1)
    mlflow.log_metric("Train True Positives", tp)
    mlflow.log_metric("Train True Negatives", tn)
    mlflow.log_metric("Train False Positives", fp)
    mlflow.log_metric("Train False Negatives", fn)

    # Log the trained model with an inferred input/output signature.
    mlflow.sklearn.log_model(best_model, "LogisticRegressionModel", signature=mlflow.models.infer_signature(X_train, best_model.predict(X_train)))

# Print results
print("Best Hyperparameters:", grid_search.best_params_)
print("CV Mean F1-Score:", cv_mean)
print("CV Std F1-Score:", cv_std)
print("Train F1-Score:", train_f1)
print("Confusion Matrix on Training Data:")
print(f"TP: {tp}, TN: {tn}, FP: {fp}, FN: {fn}")
<Experiment: artifact_location='mlflow-artifacts:/ea34bd5bb85c405f9d0630b4df04817c', creation_time=1734669699470, experiment_id='9', last_update_time=1734669699470, lifecycle_stage='active', name='WeatherType_Classification_Final', tags={}>
GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['Temperature',
                                                                          'Humidity',
                                                                          'WindSpeed',
                                                                          'Precipitation',
                                                                          'AtmosphericPressure',
                                                                          'UVIndex',
                                                                          'Visibility']),
                                                                        ('scaled_num',
                                                                         MinMaxScaler(),
                                                                         ['Temperature',
                                                                          'Humidity',
                                                                          'WindSpeed',
                                                                          'Precipitation',
                                                                          'AtmosphericPressure',
                                                                          'UVIndex',
                                                                          'Visibility']),
                                                                        ('cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['CloudCoverDescription',
                                                                          'Season',
                                                                          'Location'])])),
                                       ('classifier',
                                        LogisticRegression(max_iter=1000,
                                                           random_state=42))]),
             param_grid={'classifier__C': [0.01, 0.1, 1, 10, 100],
                         'classifier__solver': ['lbfgs', 'liblinear']},
             return_train_score=True, scoring='f1_weighted')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
{'classifier__C': 100, 'classifier__solver': 'lbfgs'}
<mlflow.models.model.ModelInfo at 0x320e1b9b0>
🏃 View run WeatherType_LogisticRegression at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9/runs/9686eaee0cb54852b7cc76cb137069e2
🧪 View experiment at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9
Best Hyperparameters: {'classifier__C': 100, 'classifier__solver': 'lbfgs'}
CV Mean F1-Score: 0.8715684740595343
CV Std F1-Score: 0.002518410798795382
Train F1-Score: 0.8727676215461186
Confusion Matrix on Training Data:
TP: 85, TN: 2196, FP: 272, FN: 87
# Experiment 2: compare several classifiers under the shared preprocessor.
import os
import numpy as np
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
# BUG FIX: LabelEncoder is used below but was never imported in this cell,
# which raises a NameError in a fresh kernel; same for mlflow.sklearn, which
# mlflow.sklearn.log_model() relies on.
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, LabelEncoder
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import mlflow
import mlflow.sklearn

# Set up MLFlow.
# SECURITY: hard-coded credentials committed with the notebook -- rotate the
# token and load it from the environment instead.
MLFLOW_TRACKING_URI = "https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME'] = 'Indumathitv27'
os.environ['MLFLOW_TRACKING_PASSWORD'] = 'a47dfca47f1f5d628277b6598611dcb071281919'
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("WeatherType_Classification_Final")

# Encode the string class labels as integers (XGBClassifier requires this).
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)

# Log-transform helper for optional use in preprocessing pipelines.
# NOTE(review): currently unused in this cell -- confirm whether it should
# be wired into the preprocessor or removed.
def log_transform(x):
    return np.log1p(x)

# Candidate models, keyed by the name used for the MLFlow run and registry.
models = {
    "LogisticRegression": LogisticRegression(),
    "RidgeClassifier": RidgeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(),
}

# Cross-validate each model and log its results to MLFlow.
for model_name, model in models.items():
    # Reuse the `preprocessor` defined in the previous experiment cell.
    pipeline = make_pipeline(preprocessor, model)

    # 5-fold cross-validated accuracy.
    cv_scores = cross_val_score(pipeline, X_train, y_train_encoded, cv=5, scoring="accuracy")
    mean_cv_score = np.mean(cv_scores)
    std_cv_score = np.std(cv_scores)

    # Create a custom run name
    run_name = f"{model_name}"  # You can add more details like timestamp, etc.

    # Log results in MLflow with custom run name
    with mlflow.start_run(run_name=run_name):  # Use the dynamic run_name
        # Log metrics
        mlflow.log_metrics({
            "cv_mean_accuracy": mean_cv_score,
            "cv_std_accuracy": std_cv_score,
        })

        # Log the model with its respective name
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="model_artifacts",
            registered_model_name=model_name,  # Use model_name directly
        )

        print(f"Model {model_name} logged successfully with run name: {run_name}.")
<Experiment: artifact_location='mlflow-artifacts:/ea34bd5bb85c405f9d0630b4df04817c', creation_time=1734669699470, experiment_id='9', last_update_time=1734669699470, lifecycle_stage='active', name='WeatherType_Classification_Final', tags={}>
2024/12/20 00:19:57 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Registered model 'LogisticRegression' already exists. Creating a new version of this model...
2024/12/20 00:20:06 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LogisticRegression, version 5
Created version '5' of model 'LogisticRegression'.
<mlflow.models.model.ModelInfo at 0x32164ab40>
Model LogisticRegression logged successfully with run name: LogisticRegression.
🏃 View run LogisticRegression at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9/runs/3f857a79c99a4f978e9a207acef83c83
🧪 View experiment at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9
2024/12/20 00:20:11 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Registered model 'RidgeClassifier' already exists. Creating a new version of this model...
2024/12/20 00:20:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RidgeClassifier, version 5
Created version '5' of model 'RidgeClassifier'.
<mlflow.models.model.ModelInfo at 0x3200453d0>
Model RidgeClassifier logged successfully with run name: RidgeClassifier.
🏃 View run RidgeClassifier at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9/runs/288d4f281ad4474fbe70018a190fd6df
🧪 View experiment at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9
2024/12/20 00:20:26 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Registered model 'RandomForestClassifier' already exists. Creating a new version of this model...
2024/12/20 00:20:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestClassifier, version 5
Created version '5' of model 'RandomForestClassifier'.
<mlflow.models.model.ModelInfo at 0x326348f80>
Model RandomForestClassifier logged successfully with run name: RandomForestClassifier.
🏃 View run RandomForestClassifier at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9/runs/2dc377b7964f4564a33d3d7902c73b26
🧪 View experiment at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9
2024/12/20 00:20:39 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Registered model 'XGBClassifier' already exists. Creating a new version of this model...
2024/12/20 00:20:48 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBClassifier, version 3
Created version '3' of model 'XGBClassifier'.
<mlflow.models.model.ModelInfo at 0x321bd6c00>
Model XGBClassifier logged successfully with run name: XGBClassifier.
🏃 View run XGBClassifier at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9/runs/6fbc65e6bbd54e299f5baa453237bc27
🧪 View experiment at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9

Differences Between Logistic Regression in Experiment #1 and #2 Experiment #1: Focus on Logistic Regression Only:

The entire experiment is dedicated to preprocessing pipelines for Logistic Regression with parameter tuning and cross-validation. More in-depth exploration of Logistic Regression: Parameter hyper-tuning (e.g., C, solver, penalty). Logging specific details like mean/std of cross-validation and results on the entire training data. Key Objective:

Optimize the Logistic Regression model and evaluate its detailed performance. Experiment #2: Part of a Comparative Analysis:

Logistic Regression is included as one of multiple classifiers (RidgeClassifier, RandomForestClassifier, XGBClassifier) for comparison. There is no parameter tuning for Logistic Regression here (unless you explicitly add it). Key Objective:

Compare the performance of different classifiers under the same preprocessing pipeline without delving deeply into Logistic Regression. Why Repeat Logistic Regression? The reason Logistic Regression is included in Experiment #2 is to:

Provide a baseline for comparison against the other classifiers. Evaluate if models like RidgeClassifier, RandomForestClassifier, or XGBClassifier outperform Logistic Regression. Should It Be Repeated? Yes, it is valid to repeat Logistic Regression in Experiment #2 because:

The purpose of Experiment #2 is to compare multiple classifiers. Including Logistic Regression in this comparison helps you understand its relative performance against the others. Experiment #1 focuses on optimizing Logistic Regression, while Experiment #2 uses it as a baseline without additional fine-tuning. If you prefer not to include it in Experiment #2, you could skip Logistic Regression and focus only on the other classifiers. However, it is recommended to keep it for comparative purposes.

1.3. Feature Engineering:#

Added the derived features from your dataset’s insights: Temp_Precip_Ratio: Indicates the balance between temperature and precipitation. Humidity_Visibility_Product: Correlation between high humidity and visibility. UV_Humidity_Interaction: Captures UV index effects when interacting with humidity. Pressure_Season_Interaction: Models seasonal effects on atmospheric pressure.

1.Temp_Precip_Ratio (Temperature / (Precipitation + 1e-6)) Why: This feature attempts to capture the interplay between temperature and precipitation. Correlation Insight: Both temperature and precipitation are correlated with the target WeatherType (based on your correlation matrix). Their ratio could highlight situations like: High temperature but low precipitation, possibly indicating dry weather. Low temperature but high precipitation, possibly indicating snowy or wet weather. Feature Engineering Intuition: Ratios often help reveal contrasts or relationships that aren’t immediately obvious in raw features.

2.Humidity_Visibility_Product (Humidity * Visibility) Why: This feature explores how visibility is affected by high or low humidity. Correlation Insight: Humidity is negatively correlated with visibility (from the matrix) but positively correlated with precipitation and WeatherType. By combining these two, the feature may highlight patterns like: High humidity with low visibility, possibly indicating foggy or overcast weather. Low humidity with high visibility, indicating clearer weather conditions. Feature Engineering Intuition: Products of features with complementary effects can capture nonlinear relationships that influence the target variable.

These were chosen based on:

Correlation Analysis: The correlation matrix and summary indicate strong relationships between WeatherType and features like Temperature, Precipitation, Humidity, and Visibility. Domain Knowledge: Weather conditions are often defined by interactions between temperature, precipitation, and humidity (e.g., dry vs. wet, hot vs. cold). Visibility and humidity are commonly tied to weather conditions like fog or haze. Feature Engineering Strategies: Ratios and products are common transformations that help extract hidden relationships. They can reduce multicollinearity (e.g., when features are highly correlated with one another).

# Experiment 3: engineered interaction features + classifier comparison.
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from xgboost import XGBClassifier  # Ensure XGBoost is installed
import mlflow
import mlflow.sklearn

# ----- Label Encoding -----
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)  # Encode training labels
y_test_encoded = label_encoder.transform(y_test)        # Encode testing labels

# ----- Feature Engineering -----
# Season is mapped to an ordinal 1-4 code solely for the interaction term.
_season_codes = {'Winter': 1, 'Spring': 2, 'Summer': 3, 'Autumn': 4}

def _add_derived_features(df):
    """Add the four derived ratio/interaction features to `df` in place."""
    df['Temp_Precip_Ratio'] = df['Temperature'] / (df['Precipitation'] + 1e-6)
    df['Humidity_Visibility_Product'] = df['Humidity'] * df['Visibility']
    df['UV_Humidity_Interaction'] = df['UVIndex'] * df['Humidity']
    df['Pressure_Season_Interaction'] = df['AtmosphericPressure'] * df['Season'].map(_season_codes)

_add_derived_features(X_train)
_add_derived_features(X_test)

# BUG FIX: the original extend() appended the derived names unconditionally,
# so re-running this cell duplicated entries in `numerical_features`; only
# append names that are not already present.
for _feature in ['Temp_Precip_Ratio', 'Humidity_Visibility_Product',
                 'UV_Humidity_Interaction', 'Pressure_Season_Interaction']:
    if _feature not in numerical_features:
        numerical_features.append(_feature)

# ----- Preprocessing Pipeline -----
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

# ----- Classifiers -----
# BUG FIX: `use_label_encoder` is not used by current XGBoost and only raised
# a "Parameters: { use_label_encoder } are not used" warning, so it is dropped.
classifiers = {
    "RandomForestClassifier": RandomForestClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "RidgeClassifier": RidgeClassifier(random_state=42),
    "XGBClassifier": XGBClassifier(eval_metric='mlogloss', random_state=42)
}

# ----- MLFlow Setup ----- (MLFLOW_TRACKING_URI comes from the earlier cell)
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment("WeatherType_Classification_Final")

# ----- Training and Logging -----
for model_name, model in classifiers.items():
    # Pipeline = shared preprocessing + the current classifier.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    # Start an MLFlow run for the current classifier
    with mlflow.start_run(run_name=f"{model_name}_Experiment3"):
        # Train on the engineered training set.
        model_pipeline.fit(X_train, y_train_encoded)
        
        # Evaluate on the held-out test split.
        y_pred_encoded = model_pipeline.predict(X_test)
        f1 = f1_score(y_test_encoded, y_pred_encoded, average='weighted')
        cm = confusion_matrix(y_test_encoded, y_pred_encoded)
        
        # Log parameters, metrics, and artifacts
        mlflow.log_param("model", model_name)
        mlflow.log_param("feature_engineering", "Temp/Precip ratio, Humidity/Visibility product, UV/Humidity interaction, Pressure/Season interaction")
        mlflow.log_metric("F1 Score", f1)
        
        # Per-class one-vs-rest confusion-matrix counts.
        for i, class_name in enumerate(label_encoder.classes_):
            tp = cm[i, i]  # True Positive for the current class
            fp = cm[:, i].sum() - tp  # False Positive
            fn = cm[i, :].sum() - tp  # False Negative
            tn = cm.sum() - (tp + fp + fn)  # True Negative
            
            # Log these metrics in MLFlow
            mlflow.log_metric(f"{class_name}_True_Positive", tp)
            mlflow.log_metric(f"{class_name}_False_Positive", fp)
            mlflow.log_metric(f"{class_name}_False_Negative", fn)
            mlflow.log_metric(f"{class_name}_True_Negative", tn)

        mlflow.sklearn.log_model(model_pipeline, f"{model_name}_pipeline")
        
        print(f"Experiment with {model_name} completed: Results logged to MLFlow.")

# NOTE: y_pred_encoded holds the predictions of the LAST classifier in the
# loop (XGBClassifier), so these decoded labels belong to that model only.
decoded_predictions = label_encoder.inverse_transform(y_pred_encoded)
<Experiment: artifact_location='mlflow-artifacts:/ea34bd5bb85c405f9d0630b4df04817c', creation_time=1734669699470, experiment_id='9', last_update_time=1734669699470, lifecycle_stage='active', name='WeatherType_Classification_Final', tags={}>
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Temperature', 'Humidity',
                                                   'WindSpeed', 'Precipitation',
                                                   'AtmosphericPressure',
                                                   'UVIndex', 'Visibility',
                                                   'Temp_Precip_Ratio',
                                                   'Humidity_Visibility_Product',
                                                   'UV_Humidity_Interaction',
                                                   'Pressure_Season_Interaction']),
                                                 ('cat', OneHotEncoder(),
                                                  ['CloudCoverDescription',
                                                   'Season', 'Location'])])),
                ('classifier', RandomForestClassifier(random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
'RandomForestClassifier'
'Temp/Precip ratio, Humidity/Visibility product, UV/Humidity interaction, Pressure/Season interaction'
2024/12/20 00:21:11 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
<mlflow.models.model.ModelInfo at 0x32164ab40>
Experiment with RandomForestClassifier completed: Results logged to MLFlow.
🏃 View run RandomForestClassifier_Experiment3 at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9/runs/55d2fa8db1bd4b3fa218427253e408d5
🧪 View experiment at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Temperature', 'Humidity',
                                                   'WindSpeed', 'Precipitation',
                                                   'AtmosphericPressure',
                                                   'UVIndex', 'Visibility',
                                                   'Temp_Precip_Ratio',
                                                   'Humidity_Visibility_Product',
                                                   'UV_Humidity_Interaction',
                                                   'Pressure_Season_Interaction']),
                                                 ('cat', OneHotEncoder(),
                                                  ['CloudCoverDescription',
                                                   'Season', 'Location'])])),
                ('classifier',
                 LogisticRegression(max_iter=1000, random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
'LogisticRegression'
'Temp/Precip ratio, Humidity/Visibility product, UV/Humidity interaction, Pressure/Season interaction'
2024/12/20 00:21:55 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
<mlflow.models.model.ModelInfo at 0x31bc9ade0>
Experiment with LogisticRegression completed: Results logged to MLFlow.
🏃 View run LogisticRegression_Experiment3 at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9/runs/c48419c35b81462a95ed66e5e70e9a7a
🧪 View experiment at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Temperature', 'Humidity',
                                                   'WindSpeed', 'Precipitation',
                                                   'AtmosphericPressure',
                                                   'UVIndex', 'Visibility',
                                                   'Temp_Precip_Ratio',
                                                   'Humidity_Visibility_Product',
                                                   'UV_Humidity_Interaction',
                                                   'Pressure_Season_Interaction']),
                                                 ('cat', OneHotEncoder(),
                                                  ['CloudCoverDescription',
                                                   'Season', 'Location'])])),
                ('classifier', RidgeClassifier(random_state=42))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
'RidgeClassifier'
'Temp/Precip ratio, Humidity/Visibility product, UV/Humidity interaction, Pressure/Season interaction'
2024/12/20 00:22:24 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
<mlflow.models.model.ModelInfo at 0x302339400>
Experiment with RidgeClassifier completed: Results logged to MLFlow.
🏃 View run RidgeClassifier_Experiment3 at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9/runs/417630c07b0d4343b66ea2dc8e87715b
🧪 View experiment at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9
/opt/anaconda3/lib/python3.12/site-packages/xgboost/core.py:158: UserWarning: [00:22:33] WARNING: /Users/runner/work/xgboost/xgboost/src/learner.cc:740: 
Parameters: { "use_label_encoder" } are not used.

  warnings.warn(smsg, UserWarning)
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Temperature', 'Humidity',
                                                   'WindSpeed', 'Precipitation',
                                                   'AtmosphericPressure',
                                                   'UVIndex', 'Visibility',
                                                   'Temp_Precip_Ratio',
                                                   'Humidity_Visibility_Product',
                                                   'UV_Humidity_Interaction',
                                                   'Pressure_Season_Interaction']),
                                                 ('cat', OneHotEncoder(),
                                                  ['CloudCoverDescription',
                                                   'Season...
                               feature_types=None, gamma=None, grow_policy=None,
                               importance_type=None,
                               interaction_constraints=None, learning_rate=None,
                               max_bin=None, max_cat_threshold=None,
                               max_cat_to_onehot=None, max_delta_step=None,
                               max_depth=None, max_leaves=None,
                               min_child_weight=None, missing=nan,
                               monotone_constraints=None, multi_strategy=None,
                               n_estimators=None, n_jobs=None,
                               num_parallel_tree=None,
                               objective='multi:softprob', ...))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
'XGBClassifier'
'Temp/Precip ratio, Humidity/Visibility product, UV/Humidity interaction, Pressure/Season interaction'
2024/12/20 00:22:53 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
<mlflow.models.model.ModelInfo at 0x3205dd400>
Experiment with XGBClassifier completed: Results logged to MLFlow.
🏃 View run XGBClassifier_Experiment3 at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9/runs/c5f48ce119484dd3b6a34bad83a6cb6e
🧪 View experiment at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9
X_train.head(5)
Temperature Humidity WindSpeed Precipitation CloudCoverDescription AtmosphericPressure UVIndex Season Visibility Location Temp_Precip_Ratio Humidity_Visibility_Product UV_Humidity_Interaction Pressure_Season_Interaction
13143 -7.0 70.0 1.5 72.0 overcast 989.30 1.0 Winter 2.0 mountain -0.097222 140.0 70.0 989.30
5670 -2.0 60.0 13.5 75.0 overcast 997.81 1.0 Winter 4.5 mountain -0.026667 270.0 60.0 997.81
1113 1.0 95.0 4.0 97.0 overcast 987.61 0.0 Winter 3.0 inland 0.010309 285.0 0.0 987.61
2602 20.0 59.0 10.0 16.0 partly cloudy 1015.12 10.0 Summer 9.5 mountain 1.250000 560.5 590.0 3045.36
10526 29.0 32.0 3.0 15.0 clear 1012.12 11.0 Spring 6.0 coastal 1.933333 192.0 352.0 2024.24
import os
import numpy as np
import mlflow
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

mlflow.set_tracking_uri("https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow")
mlflow.set_experiment("WeatherType_Classification_Final")

# Integer-encode the categorical columns for the models below.
# Fix: the original reused a single LabelEncoder for all three columns, so the
# surviving `label_encoder` only remembered the classes of the column fitted
# last, and the earlier columns could never be inverse_transform-ed. Keep one
# fitted encoder per column in `label_encoders`; `label_encoder` is still left
# bound to the last one, so any downstream cell referencing it behaves as before.
X_train_encoded = X_train.copy()
label_encoders = {}
for _col in ('CloudCoverDescription', 'Season', 'Location'):
    label_encoder = LabelEncoder()
    X_train_encoded[_col] = label_encoder.fit_transform(X_train_encoded[_col])
    label_encoders[_col] = label_encoder

# Feature selection methods
def correlation_threshold(X, threshold=0.9):
    """Drop one column from every pair whose absolute correlation exceeds threshold.

    Returns (reduced DataFrame, list of dropped column names).
    """
    abs_corr = X.corr().abs()
    cols = list(abs_corr.columns)
    to_drop = []
    for j, col in enumerate(cols):
        # Only inspect the strictly-upper triangle: correlations of `col`
        # against columns that appear before it.
        if (abs_corr.iloc[:j][col] > threshold).any():
            to_drop.append(col)
    return X.drop(columns=to_drop), to_drop

def feature_importance(X, y, model=None, threshold=0.01):
    """Keep columns whose fitted feature importance is at least threshold.

    Returns (reduced DataFrame, list of dropped column names). When no model is
    supplied, a fresh 100-tree random forest is fitted on (X, y).
    """
    estimator = model if model is not None else RandomForestClassifier(n_estimators=100)
    estimator.fit(X, y)
    keep_mask = estimator.feature_importances_ >= threshold
    kept_cols = X.columns[keep_mask]
    dropped = list(set(X.columns) - set(kept_cols))
    return X[kept_cols], dropped

def variance_threshold(X, threshold=0.01):
    """Remove columns whose variance falls below threshold.

    Returns (reduced DataFrame, list of dropped column names).
    """
    vt = VarianceThreshold(threshold=threshold)
    vt.fit_transform(X)
    kept_cols = X.columns[vt.get_support()]
    return X[kept_cols], list(set(X.columns) - set(kept_cols))

# Log feature selection methods
with mlflow.start_run(run_name="FeatureSelection_Experiment4"):

    # 1) Remove one feature of every highly correlated pair.
    X_corr, dropped_corr = correlation_threshold(X_train_encoded, threshold=0.75)
    mlflow.log_params({"correlation_threshold": 0.75, "dropped_features_corr": dropped_corr})

    # 2) Remove features with low random-forest impurity importance.
    X_important, dropped_imp = feature_importance(X_train_encoded, y_train, model=RandomForestClassifier(), threshold=0.05)
    mlflow.log_params({"threshold_feature_importance": 0.05, "dropped_features_imp": dropped_imp})

    # 3) Remove near-constant features.
    X_variance, dropped_var = variance_threshold(X_train_encoded, threshold=0.06)
    mlflow.log_params({"variance_threshold": 0.06, "dropped_features_var": dropped_var})

    def cross_val_accuracy(features, target):
        # Mean 5-fold CV accuracy of a fresh 100-tree random forest on the
        # reduced feature set.
        clf = RandomForestClassifier(n_estimators=100)
        return np.mean(cross_val_score(clf, features, target, cv=5, scoring='accuracy'))

    # One metrics payload covering all three selection strategies.
    mlflow.log_metrics({
        "cv_accuracy_corr": cross_val_accuracy(X_corr, y_train),
        "cv_accuracy_imp": cross_val_accuracy(X_important, y_train),
        "cv_accuracy_var": cross_val_accuracy(X_variance, y_train),
    })

    print("Logged feature selection and accuracy metrics in MLFlow.")

# Show which features each strategy removed.
print(f"Dropped Features due to Correlation: {dropped_corr}")
print(f"Dropped Features due to Feature Importance: {dropped_imp}")
print(f"Dropped Features due to Variance Threshold: {dropped_var}")
<Experiment: artifact_location='mlflow-artifacts:/ea34bd5bb85c405f9d0630b4df04817c', creation_time=1734669699470, experiment_id='9', last_update_time=1734669699470, lifecycle_stage='active', name='WeatherType_Classification_Final', tags={}>
Logged feature selection and accuracy metrics in MLFlow.
🏃 View run FeatureSelection_Experiment4 at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9/runs/8f23b3e2feba4ab7a2d74ccb9f9a9a1b
🧪 View experiment at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9
Dropped Features due to Correlation: ['Humidity_Visibility_Product', 'UV_Humidity_Interaction', 'Pressure_Season_Interaction']
Dropped Features due to Feature Importance: ['UV_Humidity_Interaction', 'WindSpeed', 'Location', 'Season', 'Humidity', 'Humidity_Visibility_Product']
Dropped Features due to Variance Threshold: []
import os
import numpy as np
import mlflow
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Point the MLflow client at the DagsHub tracking server and reuse the
# notebook's shared experiment.
mlflow.set_tracking_uri("https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow")
mlflow.set_experiment("WeatherType_Classification_Final")

# Preprocessing pipeline (standardization is important for PCA: the principal
# directions are scale-sensitive, so features are centered/scaled first).
scaler = StandardScaler()

# Define PCA transformation
def apply_pca(X, n_components=None):
    """Fit PCA on X and transform it.

    Returns (transformed data, per-component explained-variance ratios,
    the fitted PCA object). n_components=None keeps every component.
    """
    decomposer = PCA(n_components=n_components)
    transformed = decomposer.fit_transform(X)
    return transformed, decomposer.explained_variance_ratio_, decomposer

# Scree plot to visualize the explained variance
def plot_scree(explained_variance_ratio):
    """Draw a scree plot: explained-variance ratio per principal component."""
    component_ids = range(1, len(explained_variance_ratio) + 1)
    plt.figure(figsize=(10, 6))
    plt.plot(component_ids, explained_variance_ratio, marker='o', linestyle='--')
    plt.title('Scree Plot')
    plt.xlabel('Principal Component')
    plt.ylabel('Explained Variance Ratio')
    plt.grid(True)
    plt.show()

# Reduce the encoded training features to principal components, then measure
# how well a random forest classifies in the reduced space; configuration and
# scores are logged to the MLflow run "PCA_Dimensionality_Reduction".
with mlflow.start_run(run_name="PCA_Dimensionality_Reduction"):
    # PCA is scale-sensitive, so standardize (zero mean, unit variance) first.
    X_scaled = scaler.fit_transform(X_train_encoded)  # X_train_encoded comes from the earlier encoding cell

    # Project onto the first 5 principal components.
    X_pca, explained_variance_ratio, pca = apply_pca(X_scaled, n_components=5)  # Choose 5 components or None for all
    
    # Record the PCA configuration and the variance captured per component.
    mlflow.log_params({"n_components": 5, "explained_variance_ratio": explained_variance_ratio.tolist()})
    
    # Show each component as a weight vector over the original feature columns.
    # NOTE(review): relies on `pd` (pandas) being imported by another cell of
    # this notebook — it is not imported in this cell; confirm execution order.
    components_df = pd.DataFrame(pca.components_, columns=X_train_encoded.columns)
    print("Selected Principal Components:")
    print(components_df)

    # Scree plot of the per-component explained-variance ratios.
    plot_scree(explained_variance_ratio)

    # 5-fold cross-validated accuracy of a fresh 100-tree random forest on the
    # PCA-transformed features.
    model = RandomForestClassifier(n_estimators=100)
    cross_val_score_model = cross_val_score(model, X_pca, y_train, cv=5, scoring='accuracy')
    mean_cv_score = np.mean(cross_val_score_model)
    
    # Log accuracy score
    mlflow.log_metric("cv_accuracy_pca", mean_cv_score)
    
    print(f"Logged PCA results with {mean_cv_score:.4f} accuracy to MLFlow.")
<Experiment: artifact_location='mlflow-artifacts:/ea34bd5bb85c405f9d0630b4df04817c', creation_time=1734669699470, experiment_id='9', last_update_time=1734669699470, lifecycle_stage='active', name='WeatherType_Classification_Final', tags={}>
Selected Principal Components:
   Temperature  Humidity  WindSpeed  Precipitation  CloudCoverDescription  \
0     0.318230 -0.307967  -0.196588      -0.341445              -0.214744   
1     0.187871  0.403225   0.458212       0.364676               0.067473   
2    -0.089703 -0.020359  -0.016181      -0.015413              -0.246401   
3    -0.148146  0.232370  -0.023001       0.109567               0.314169   
4    -0.121050 -0.000093   0.031405       0.025484              -0.046113   

   AtmosphericPressure   UVIndex    Season  Visibility  Location  \
0             0.164372  0.351835 -0.297979    0.370866 -0.125240   
1             0.036160  0.273735 -0.203250   -0.167205 -0.098644   
2            -0.069922  0.402947  0.524336    0.104205  0.130055   
3            -0.119731 -0.159241  0.042520    0.486522  0.032161   
4            -0.405439  0.049595 -0.239572    0.026157  0.762337   

   Temp_Precip_Ratio  Humidity_Visibility_Product  UV_Humidity_Interaction  \
0           0.083187                     0.242250                 0.230161   
1          -0.093687                     0.046204                 0.494858   
2           0.113944                     0.085063                 0.404694   
3          -0.198859                     0.700455                -0.033072   
4           0.361233                     0.014495                 0.049240   

   Pressure_Season_Interaction  
0                     0.301886  
1                     0.208759  
2                    -0.525712  
3                    -0.048026  
4                     0.205875  
../../_images/0dedd7748f05529660a9c381c4905c57c08761299718eb235272fa98863c1a03.png
Logged PCA results with 0.8832 accuracy to MLFlow.
🏃 View run PCA_Dimensionality_Reduction at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9/runs/7a67e8d3863c4d319316e8f7b1173d8e
🧪 View experiment at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9

1.3.1. Key Points:#

Ensemble model: the VotingClassifier combines Random Forest, Logistic Regression, SVC, and XGBoost models — a simple way to aggregate predictions from multiple models that can improve accuracy. Hyperparameter tuning: GridSearchCV is used here, but it can be replaced with Optuna for a more advanced hyperparameter search. Metrics: the classification report is logged for key metrics such as accuracy, F1-score, and macro-average F1. Logging: all hyperparameters, metrics, and models are logged to MLflow for easy tracking and comparison. Expected outcome — model comparison: see how the ensemble performs relative to individual models such as Random Forest, Logistic Regression, and XGBoost; custom results: detailed evaluation results are logged, including accuracy, precision, recall, and F1-scores; tracking and visualization: everything is recorded in MLflow for easy analysis. This experiment explores different strategies for combining models to improve weather-type classification while keeping every run tracked for comparison.

import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Experiment 6: univariate feature selection (top-10 ANOVA F-scores) feeding a
# fixed-hyperparameter random forest; parameters, metrics, and the fitted
# model are logged to MLflow.
with mlflow.start_run(run_name="FeatureSelection_RandomForest_Experiment6"):
    # Keep the 10 features with the highest ANOVA F-statistic against y_train.
    selector = SelectKBest(score_func=f_classif, k=10)  # Select top 10 features
    X_train_selected = selector.fit_transform(X_train_encoded, y_train)
    selected_features = X_train_encoded.columns[selector.get_support()]

    # Standardize the selected features (fit on the training matrix).
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_selected)

    # Train the random forest on the reduced, scaled training matrix.
    rf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2, random_state=42)  # Example hyperparameters
    rf.fit(X_train_scaled, y_train)

    # NOTE(review): predictions below are made on the same data the model was
    # fitted on, so the logged accuracy/F1 are optimistic training-set scores;
    # evaluate on a held-out split to estimate generalization.
    y_pred = rf.predict(X_train_scaled)
    accuracy = accuracy_score(y_train, y_pred)
    f1 = f1_score(y_train, y_pred, average='weighted')
    
    # Record the selector/forest hyperparameters, the (training) metrics, and
    # the fitted model artifact.
    mlflow.log_params({
        "k_best_features": 10,
        "rf_n_estimators": rf.n_estimators,
        "rf_max_depth": rf.max_depth,
        "rf_min_samples_split": rf.min_samples_split,
        "rf_random_state": rf.random_state,
    })
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1)
    mlflow.sklearn.log_model(rf, "random_forest_model")

    print(f"Custom Experiment complete with Feature Selection and RandomForestClassifier, accuracy: {accuracy}.")

# Results will be logged in MLFlow
RandomForestClassifier(max_depth=10, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
2024/12/20 02:19:15 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
<mlflow.models.model.ModelInfo at 0x3263f0f80>
Custom Experiment complete with Feature Selection and RandomForestClassifier, accuracy: 0.9830492424242424.
🏃 View run FeatureSelection_RandomForest_Experiment6 at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9/runs/8fdcd91b3a724dc3868f3dd968ba1353
🧪 View experiment at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9

1.3.2. Experiment7#

Key Components: Dimensionality Reduction (PCA): PCA is used to reduce the feature space to retain the most important components, which helps with improving model performance by removing redundant features and reducing overfitting.

Feature Selection (SelectKBest): SelectKBest selects the top K features based on a statistical test (e.g., ANOVA F-value). In this case, we are selecting the top 10 features that are most important for classification.

Model Comparison: We use a variety of models (Random Forest, Logistic Regression, SVM, XGBoost) and tune their hyperparameters using GridSearchCV to find the best-performing versions of these models.

Voting Classifier: A Voting Classifier is used to combine the predictions of the best-performing individual models. This helps in creating a more robust and reliable model.

MLFlow Logging: Parameters, metrics, and models are logged to MLFlow, allowing for easy tracking, comparison, and reproducibility of the experiment.

Advantages — dimensionality reduction and feature selection: reducing the feature space and keeping only the most relevant features can improve model performance and reduce overfitting; model comparison: several models are evaluated, and an ensemble approach is used to improve accuracy; MLflow tracking: all results are logged and tracked, making it easy to compare experiments and replicate results. Possible improvements — alternative feature-selection techniques: experiment with methods such as Recursive Feature Elimination (RFE) or L1-based feature selection; ensemble variants: try other ensemble methods such as Bagging or Boosting (e.g., AdaBoost, Gradient Boosting); additional hyperparameter tuning: expand the hyperparameter search space or use more advanced optimization techniques such as RandomizedSearchCV or Bayesian optimization. Overall, this experiment provides a comprehensive approach to comparing models, reducing dimensionality, selecting important features, and combining models in an ensemble, with every result logged and tracked in MLflow for reproducibility.

import mlflow
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Soft-voting ensemble of a random forest and a gradient-boosting classifier,
# trained on all (standardized) encoded features and logged to MLflow.
with mlflow.start_run(run_name="Ensemble_VotingClassifier_Experiment_With_F1"):
    # Standardize every feature (fit on the training matrix).
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_encoded)

    # Base classifiers
    rf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=2)  # Example hyperparameters
    gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)  # Example hyperparameters
    
    # Soft voting averages the two models' predicted class probabilities.
    voting_clf = VotingClassifier(estimators=[
        ('rf', rf),
        ('gbc', gbc),
    ], voting='soft')
    
    # Fit the model
    voting_clf.fit(X_train_scaled, y_train)

    # NOTE(review): the ensemble is evaluated on its own training data, so the
    # logged accuracy/F1 are optimistic training-set scores — use a held-out
    # split for an honest estimate.
    y_pred = voting_clf.predict(X_train_scaled)
    accuracy = accuracy_score(y_train, y_pred)
    f1 = f1_score(y_train, y_pred, average='weighted')

    # Log metrics and model
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("f1_score", f1)

    # Log hyperparameters of both base models for later comparison.
    mlflow.log_params({
        'rf_n_estimators': rf.n_estimators,
        'rf_max_depth': rf.max_depth,
        'rf_min_samples_split': rf.min_samples_split,
        'gbc_n_estimators': gbc.n_estimators,
        'gbc_learning_rate': gbc.learning_rate,
        'gbc_max_depth': gbc.max_depth,
    })

    mlflow.sklearn.log_model(voting_clf, "voting_classifier_model")

    print(f"Custom Ensemble Experiment complete with Voting Classifier, accuracy: {accuracy}, F1-score: {f1}.")

# Results will be logged in MLFlow
VotingClassifier(estimators=[('rf', RandomForestClassifier(max_depth=10)),
                             ('gbc', GradientBoostingClassifier())],
                 voting='soft')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
2024/12/20 02:17:22 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
<mlflow.models.model.ModelInfo at 0x31bbcb740>
Custom Ensemble Experiment complete with Voting Classifier, accuracy: 0.9755681818181818, F1-score: 0.9756306684644996.
🏃 View run Ensemble_VotingClassifier_Experiment_With_F1 at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9/runs/b00c4951ef09459b91708cb3b2c9fe41
🧪 View experiment at: https://dagshub.com/Indumathitv27/WeatherTypeClassification_FinalProject.mlflow/#/experiments/9
import matplotlib.pyplot as plt

# Summary chart: best weighted F1 score observed for each experiment's model.
models = ['LogisticRegression', 'RidgeClassifier', 'RandomForestClassifier', 'XGBClassifier','Ensemble_VotingClassifier','FeatureSelection_RandomForest']
f1_scores = [0.88, 0.84, 0.92, 0.919, 0.975, 0.983]  # one score per model, same order

# Build the bar chart on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(models, f1_scores, color='skyblue')

# Titles, axis labels, and readability tweaks.
ax.set_title('F1 Scores for Different Models')
ax.set_xlabel('Models')
ax.set_ylabel('F1 Score')
ax.tick_params(axis='x', rotation=45)  # slanted model names so they fit
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Render.
plt.tight_layout()
plt.show()
<Figure size 1000x600 with 0 Axes>
<BarContainer object of 6 artists>
Text(0.5, 1.0, 'F1 Scores for Different Models')
Text(0.5, 0, 'Models')
Text(0, 0.5, 'F1 Score')
([0, 1, 2, 3, 4, 5],
 [Text(0, 0, 'LogisticRegression'),
  Text(1, 0, 'RidgeClassifier'),
  Text(2, 0, 'RandomForestClassifier'),
  Text(3, 0, 'XGBClassifier'),
  Text(4, 0, 'Ensemble_VotingClassifier'),
  Text(5, 0, 'FeatureSelection_RandomForest')])
../../_images/9afe5a2f3b9f0f3f9b33a3123ab4aaa0ac79422cd3acbc019f9b82bfaa9329da.png
import pandas as pd
import joblib

# Persist a one-row CSV describing the winning experiment
# (FeatureSelection_RandomForest, weighted F1 = 0.983) next to the model file.
metadata = {
    'model_name': ['FeatureSelection_RandomForest'],
    'f1_score': [0.983],
    'hyperparameters': ['k_best_features=10,rf_n_estimators=100,rf_max_depth=10,rf_min_samples_split=2,rf_random_state=42']
}
df = pd.DataFrame(metadata)
df.to_csv('model_metadata.csv', index=False)

# Save the model
# NOTE(review): `model` here is the RandomForestClassifier created in the PCA
# cell and only passed to cross_val_score (which clones it), so it was never
# fitted — while the metadata above describes the experiment-6 `rf`. Saving
# `rf` (or refitting `model`) is probably what was intended; confirm which
# estimator this pickle should contain before relying on it.
joblib.dump(model, 'random_forest_model.pkl')
['random_forest_model.pkl']